MacFormat 2000 July

home *** CD-ROM | disk | FTP | other *** search

/ MacFormat 2000 July / macformat-092.iso / Dreamweaver 3 / Configuration / Commands / Clean Up Word HTML.js < prev next >

Wrap

Text File | 1999-12-01 | 50.4 KB | 2,016 lines

//
// Copyright 1999 Macromedia, Inc. All rights reserved.
// --------------------------------------------------------------------------
//
//   Import Word HTML.js
//
//   This command is similar to the "Clean Up HTML.js" command, except that it
//   is specifically designed to work with HTML documents generated by
//   Microsoft Word.
//
// --------------------------------------------------------------------------


//*******************  Commands API  *******************

/////////////////////////////////////////////////////////////////////////////
// Function
//    commandButtons
//
// Purpose
//    Describe the dialog buttons as a flat array of (label, handler) pairs.
//    Each handler is a string of JavaScript source.
//
// Returns
//    Array holding BTN_OK, BTN_Cancel and BTN_Help, each followed by the
//    code string associated with it.
//
function commandButtons()
{
  return new Array(
    BTN_OK,     "importWordHTML()",   // main entry point
    BTN_Cancel, "window.close()",
    BTN_Help,   "displayHelp()");
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    canAcceptCommand
//
// Returns
//    true only when dw.getFocus() reports 'document'.
//
function canAcceptCommand()
{
  return (dw.getFocus() == 'document');
}


//*******************  Global Variables  *******************

var helpDoc = MM.HELP_cmdCleanUpWordHTML;

var gWordVersion;   // Set by detectWordVersion(): 97, 2000, or -1 while unknown
var CB;             // The checkbox group
var CBTags;         // The Checkboxes in the dialog
var T;              // The tab control, created in initialize().  Declared here
                    // so it is no longer an implicit global.

// Logging vars
var gRemoveMetaTags     = 0;
var gRemoveWordXML      = 0;
var gRemoveConditionals = 0;
var gRemoveEmptyParas   = 0;
var gRemoveMargins      = 0;
var gRemoveInlineCSS    = 0;
var gRemovemsoStyle     = 0;
var gRemoveNonCSS       = 0;
var gRemoveTableCSS     = 0;
var gRemoveUnusedCSS    = 0;
var gFontsConverted     = 0;
var gNestingFixed       = 0;
var gBackgroundSet      = "";
var gSourceFormatted    = 0;


//**************  Main functions  *********************

/////////////////////////////////////////////////////////////////////////////
// Function
//    importWordHTML
//
// Purpose
//    This is the "main" function that the dialog calls when the user
//    clicks OK.  It saves the dialog settings, configures logging, runs
//    the version specific clean up, then the version independent clean up
//    and post processing, and finally calls finish().
//
function importWordHTML()
{
  // Local on purpose: the original assigned an undeclared (global) 'version'.
  var version;

  T.finish(); //ensure Tabs are through getting input

  // Lets save the settings first.  That way if something crashes or
  // goes wrong during the processing, the user doesn't need to reset
  // all of the options again.
  if (doSaveSettings())
    saveSettings();

  // Set up logging particulars
  if (doShowLog())
  {
    MM_enableLogging();
    MM_clearLog();
  }
  else
  {
    MM_disableLogging();
  }

  // getVersion() returns the option value, which is a string.
  version = getVersion();
  switch (version)
  {
    case "2000":
      ProcessWord2000();
      break;

    case "97":
      ProcessWord97();
      break;
  }

  // NOTE(review): the busy cursor is only raised after the version specific
  // processing above has already run -- confirm whether that is intended.
  MM.setBusyCursor();

  // Do some processing that needs to be done no matter the version.
  GeneralProcessing();

  // Cleanup
  PostProcess();

  MM.clearBusyCursor();

  // Show the log, if they said to.
  finish();
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    GeneralProcessing
//
// Purpose
//    Clean up steps that apply to every Word version: removing the
//    meta/link tags and setting the background color, each only when its
//    option is enabled.
//
function GeneralProcessing()
{
  if (doRemoveMetaLink())
    removeMetaLink();

  if (doSetBgColor())
    setBgColor();
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    PostProcess
//
// Purpose
//    Anything that needs to be done after we have done all of the cleaning
//    should be done in here.  This gets run no matter what options are
//    turned on.
//
function PostProcess()
{
  var root = dw.getDocumentDOM('document').documentElement;
  var html;

  // Remove the blank style="" attributes.
  html = root.outerHTML;
  html = html.replace(/\s*style=(""|'')/g, "");
  root.outerHTML = html;

  RemoveEmptyTags();

  if (doApplySourceFormatting())
  {
    // Included from "Source Formatting.js".
    formatSource();
    gSourceFormatted = 1;
  }
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    initialize
//
// Purpose
//    This is called from body onLoad to initialize the dialog.
//
function initialize()
{
  getCheckboxNames();

  // Initialize the checkboxes.
  initCheckboxes();

  // Ok, we have hooked up all of the checkboxes.  Now we need to set
  // them to initial values of some kind.
  setCheckboxStates();

  //Initialize the TabControl.  (Pass in the prefix used for the tab layers)
  T = new TabControl('Tab');

  //Add tab pages.  (Pass the layer name, and the page object)
  T.addPage('basic',          new Pg1(LABEL_Basic));
  T.addPage('detailWord2000', new Pg2(LABEL_Detailed));
  T.addPage('detailWord97',   new Pg3(LABEL_Detailed));

  T.addGroup("group2000", new Array("basic", "detailWord2000"));
  T.addGroup("group97",   new Array("basic", "detailWord97"));

  //Show default group
  T.showGroup("group97");

  //Initialize and display the tabs.  (Could pass the name of a page to start on)
  T.start();

  // Determine what version of Word this thing came from.
  detectWordVersion();
  setWordVersion();
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    setDropDownStates
//
// Purpose
//    Retrieve the settings for the font drop downs from the MetaFile.
//    A stored value is applied only when it is a plain decimal index that
//    falls inside the fontValues array.
//
function setDropDownStates()
{
  var path = document.URL;
  var name, font, value, valueStr;
  var metaFile;
  var i;

  metaFile = MMNotes.open(path, false);
  if (metaFile != 0)
  {
    // We have some stored settings.  Set the drop downs based on them.
    for (i = 1; i <= 7; i++)
    {
      name     = "menuSize" + i;
      font     = findObject(name);
      valueStr = MMNotes.get(metaFile, name);
      if (font != null && valueStr)
      {
        value = parseInt(valueStr, 10);

        // Round-trip check: accept only strings that are exactly an
        // integer.  The original compared against 'value.toString' (the
        // function itself, no call), which could never be equal, so the
        // saved selections were silently ignored.
        if ((valueStr == value.toString()) &&
            (value >= 0) && (value < fontValues.length))
        {
          font.selectedIndex = value;
        }
      }
    }

    // We are done with the file, close it.
    MMNotes.close(metaFile);
    metaFile = 0;
  }

  // Note that the dropdowns are initialized to some default settings
  // in initDropDowns().  So if there are no keys in the metafile,
  // the dropdowns will be set to something appropriate.
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    initDropDowns
//
// Purpose
//    Initialize the convert font sizes drop downs with the array that
//    is defined in the .htm file.
//
function initDropDowns()
{
  var i;
  var font;
  var select;

  for (i = 1; i <= 7; i++)
  {
    font = findObject("menuSize" + i);

    // Select something appropriate as a default.
    switch (i)
    {
      case 1:
        select = 6;
        break;
      case 2:
        select = 7;
        break;
      case 3:
        select = 1;
        break;
      case 4:
      case 5:
      case 6:
      case 7:
        select = 0;
        break;
    }

    if (font != null)
      loadSelectList(font, fontValues, true, select);
  }
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    getCheckboxNames
//
// Purpose
//    Search through our dialog and find all of our "checkboxes" and store
//    their tags in CBTags.  This way, if we add or delete them, we don't
//    have to do as much maintenance.
//
function getCheckboxNames()
{
  CBTags = new Array();
  traverse(document.documentElement, getCheckboxes);
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    getCheckboxes
//
// Purpose
//    Callback for traverse.  This finds each "checkbox" input in our dialog
//    (an INPUT whose onClick mentions CB.clicked) and adds the tag to
//    CBTags.  This way we can easily search the checkboxes.
//
// Returns
//    Always true.
//
function getCheckboxes(tag)
{
  var tagName = tag.tagName.toUpperCase();
  var click;      // was an implicit global

  if (tagName != "INPUT")
    return true;

  click = tag.getAttribute("onClick");
  if (click != null && click.match(/CB\.clicked/))
  {
    // This is a checkbox, add its tag to the list.
    CBTags.push(tag);
  }

  return true;
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    initCheckboxes
//
// Purpose
//    Create and hookup the hierarchical checkboxes.  Note, the hierarchy is
//    stored in the "parents" attribute on the checkbox in the HTML.  This
//    way, the HTML file specifies the relationship and the javascript
//    doesn't care.  Saves us maintenance time when changing the checkboxes.
//
function initCheckboxes()
{
  var i;          // was an implicit global
  var parents;    // was an implicit global

  CB = new CheckboxSet();

  // Add the "parent" checkboxes first because they need to exist
  // for the child checkboxes to name them as a parent.  Parent
  // checkboxes have no "parents" attribute.
  for (i = 0; i < CBTags.length; i++)
  {
    parents = CBTags[i].getAttribute("parents");
    if (parents == null || parents == "")
      CB.addCheckbox(CBTags[i].getAttribute("name"));
  }

  // Now that all of the parent checkboxes have been registered,
  // we can now register the child checkboxes.
  for (i = 0; i < CBTags.length; i++)
  {
    parents = CBTags[i].getAttribute("parents");
    if (parents != null && parents != "")
    {
      CB.addCheckbox(CBTags[i].getAttribute("name"),
                     CBTags[i].getAttribute("parents"));
    }
  }
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    setCheckboxStates
//
// Purpose
//    Set the checkbox states based on the saved defaults, or if we don't
//    have any saved defaults, set the checkboxes to our hard coded
//    defaults.
//
function setCheckboxStates()
{
  var i;          // was an implicit global

  if (setCheckboxStatesFromSavedDefaults())
    return;

  // Default settings.  Turn all options on by default.
  for (i = 0; i < CBTags.length; i++)
    CB.check(CBTags[i].getAttribute("name"), true);
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    setCheckboxStatesFromSavedDefaults
//
// Purpose
//    Set the checkboxes based on the defaults that we have saved.  Every
//    key in the metafile that matches a checkbox name gets that checkbox
//    checked.
//
// Returns
//    true if we were able to read info from the metafile.  false if we
//    could not read the metafile (didn't exist, etc) and we should set
//    some defaults ourselves.
//
function setCheckboxStatesFromSavedDefaults()
{
  var path     = document.URL;
  var metaFile = MMNotes.open(path, false);
  var keys;
  var i, j;

  if (metaFile == 0)
  {
    // No settings to read.
    return false;
  }

  // We have some stored settings.  Set the checkboxes based on them.
  keys = MMNotes.getKeys(metaFile);
  for (i = 0; i < keys.length; i++)
  {
    for (j = 0; j < CBTags.length; j++)
    {
      if (keys[i] == CBTags[j].getAttribute("name"))
      {
        CB.check(CBTags[j].getAttribute("name"), true);
        break;
      }
    }
  }

  // We are done with the file, close it.
  MMNotes.close(metaFile);
  metaFile = 0;

  return true;
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    finish
//
// Purpose
//    We are done.  Do any last minute stuff and show any log information
//    that user may have requested, then close the dialog window.
//
function finish()
{
  // Show what we did if show log is enabled
  if ( doShowLog() )
  {
    var bDidSomething = ( (gRemoveMetaTags > 0)     ||
                          (gRemoveWordXML > 0)      ||
                          (gRemoveConditionals > 0) ||
                          (gRemoveEmptyParas > 0)   ||
                          (gRemoveMargins > 0)      ||
                          (gRemoveInlineCSS > 0)    ||
                          (gRemovemsoStyle > 0)     ||
                          (gRemoveNonCSS > 0)       ||
                          (gRemoveTableCSS > 0)     ||
                          (gRemoveUnusedCSS > 0)    ||
                          (gFontsConverted > 0)     ||
                          (gNestingFixed > 0)       ||
                          (gBackgroundSet != "")    ||
                          (gSourceFormatted > 0) );

    MM_note(MSG_TrcSummaryHeader);

    if (bDidSomething)
    {
      if (gRemoveMetaTags > 0)
        MM_note(MSG_TrcRemoveMetaTags, gRemoveMetaTags);
      if (gRemoveWordXML > 0)
        MM_note(MSG_TrcRemoveWordXML, gRemoveWordXML);
      if (gRemoveConditionals > 0)
        MM_note(MSG_TrcRemoveConditionals, gRemoveConditionals);
      if (gRemoveEmptyParas > 0)
        MM_note(MSG_TrcRemoveEmptyParas, gRemoveEmptyParas);
      if (gRemoveMargins > 0)
        MM_note(MSG_TrcRemoveMargins, gRemoveMargins);
      if (gRemoveInlineCSS > 0)
        MM_note(MSG_TrcRemoveInlineCSS, gRemoveInlineCSS);
      if (gRemovemsoStyle > 0)
        MM_note(MSG_TrcRemovemsoStyle, gRemovemsoStyle);
      if (gRemoveNonCSS > 0)
        MM_note(MSG_TrcRemoveNonCSS, gRemoveNonCSS);
      if (gRemoveTableCSS > 0)
        MM_note(MSG_TrcRemoveTableCSS, gRemoveTableCSS);
      if (gRemoveUnusedCSS > 0)
        MM_note(MSG_TrcRemoveUnusedCSS, gRemoveUnusedCSS);
      if (gFontsConverted > 0)
        MM_note(MSG_TrcFontsConverted, gFontsConverted);
      if (gNestingFixed > 0)
        MM_note(MSG_TrcNestingFixed, gNestingFixed);
      if (gBackgroundSet != "")
        MM_note(MSG_TrcBackgroundSet, gBackgroundSet);
      if (gSourceFormatted > 0)
        MM_note(MSG_TrcSourceFormatted);
    }
    else
    {
      MM_note( MSG_TrcDidNothing );
    }

    MM_showLog();
  }

  window.close();
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    saveSettings
//
// Purpose
//    Save the options that the
//    user has selected so that the next time
//    they use this dialog, it will have their last settings.  The metafile
//    is cleared first, then one key is written per checked checkbox and
//    one per font size drop down.
//
function saveSettings()
{
  var path     = document.URL;
  var metaFile = MMNotes.open(path, true);
  var name;
  var font;       // was an implicit global
  var i;          // was an implicit global

  if (metaFile == 0)
  {
    alert(wrapTextForAlert(MSG_metaFileError, 80));
    return;
  }

  // Make sure the meta file does not contain stale information.
  clearMetaFile(metaFile);

  // Now set a key for each option that is on.
  for (i = 0; i < CBTags.length; i++)
  {
    name = CBTags[i].getAttribute("name");
    if (CB.isChecked(name))
      MMNotes.set(metaFile, name, "1");
  }

  // Now, save the state of the "convert fonts" things.
  for (i = 1; i <= 7; i++)
  {
    name = "menuSize" + i;
    font = findObject(name);
    if (font != null)
      MMNotes.set(metaFile, name, font.selectedIndex + "");
  }

  MMNotes.close(metaFile);
  metaFile = 0;
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    clearMetaFile
//
// Purpose
//    Clear the metafile so that we don't have stale info in there.
//
// Parameters
//    metaFile - handle returned by MMNotes.open(); 0 is ignored.
//
function clearMetaFile(metaFile)
{
  var keys;
  var i;          // was an implicit global

  if (metaFile == 0)
    return;

  // The key list is captured once, before anything is removed.
  keys = MMNotes.getKeys(metaFile);
  for (i = 0; i < keys.length; i++)
    MMNotes.remove(metaFile, keys[i]);
}


// ----------- Autodetection routines ----------------

/////////////////////////////////////////////////////////////////////////////
// Function
//    detectWordVersion
//
// Purpose
//    Find out what version of Word the document was generated by.  We do
//    this so that the user doesn't need to worry about it.  The result is
//    stored in gWordVersion and selected in the "selectWordVersion" drop
//    down.  If nothing is detected we fall back to 2000 and tell the user.
//
// Returns
//    The detected (or defaulted) version: 97 or 2000, as a number.
//
function detectWordVersion()
{
  var bFoundVersion = true;
  var versionSelect;
  var i;          // was an implicit global

  // Init gWordVersion
  gWordVersion = -1;

  // This will set 'gWordVersion' if it finds anything
  traverse(null, findVersionInMetaTag);

  if (gWordVersion == -1)
  {
    // We could not determine the version of Word used to generate this
    // document.  Default to Word 2000.
    gWordVersion  = 2000;
    bFoundVersion = false;
  }

  // Set the dropdown to have what we have detected.  The drop down is
  // referenced explicitly instead of through 'with', so it is obvious
  // which object owns 'options' and 'selectedIndex'.
  versionSelect = findObject("selectWordVersion");
  if (versionSelect != null)
  {
    for (i = 0; i < versionSelect.options.length; i++)
    {
      // Loose comparison on purpose: the option value is a string while
      // gWordVersion is a number.
      if (versionSelect.options[i].value == gWordVersion)
      {
        versionSelect.selectedIndex = i;
        break;
      }
    }
  }

  if (!bFoundVersion)
  {
    // Notify the user that we were unable to determine the version of Word.
    alert(wrapTextForAlert(MSG_UnknownVersion, 80));
  }

  return gWordVersion;
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    findVersionInMetaTag
//
// Purpose
//    This is a callback from traverse.  This function is the meat of
//    finding the version of Word used to generate the HTML.  We look
//    at the meta tags and find the one that gives the version of Word.
//
// Returns
//    false once a version has been recognized and stored in gWordVersion,
//    true otherwise.
//
function findVersionInMetaTag(tag)
{
  // These three were implicit globals; 'name' in particular clobbered
  // window.name.
  var tagName = tag.tagName;
  var name;
  var content;

  if (tagName.toUpperCase() != "META")
    return true;

  name = tag.getAttribute("NAME");
  if (name == null || name.toUpperCase() != "GENERATOR")
    return true;

  content = tag.getAttribute("CONTENT");
  if (content == null)
    return true;

  // Order matters: "word 97" also matches the looser /word 9/ test below.
  if (content.search(/word 97/i) >= 0)
  {
    gWordVersion = 97;
    return false;
  }
  else if (content.search(/word 81/i) >= 0)
  {
    gWordVersion = 97;
    return false;
  }
  else if (content.search(/word 9/i) >= 0)
  {
    gWordVersion = 2000;
    return false;
  }

  return true;
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    setWordVersion
//
// Purpose
//    This gets called when the user selects a different version of Word
//    from the drop down list.  This grabs the selected value from the
//    drop down and shows the appropriate options by hiding or showing
//    the different layers.
//
function setWordVersion()
{
  var version = getVersion();   // was an implicit global

  // Loose comparison on purpose: version is a string, gWordVersion a number.
  if (version != gWordVersion)
  {
    // The user is trying to select the options for a version of
    // word that does not match what we think it is.  Allow them
    // to do this, but warn them that in doing so the import
    // may not work since the algorithms for the different versions
    // are different.
    alert(wrapTextForAlert(MSG_DiffWordVersion, 80));
  }

  switch (version)
  {
    case "2000":
      T.showGroup("group2000");
      break;

    case "97":
      T.showGroup("group97");
      break;

    default:
      alert(wrapTextForAlert(MSG_Error, 80));
      break;
  }

  T.refresh();
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    getVersion
//
// Purpose
//    Quickie function to get the version from the selected version
//
// Returns
//    The value of the currently selected option of "selectWordVersion".
//
function getVersion()
{
  var versionSelect = findObject("selectWordVersion");

  return versionSelect.options[versionSelect.selectedIndex].value;
}


// ----- Check functions ----------------------------------------------------

/////////////////////////////////////////////////////////////////////////////
// Returns whether the "remove meta/link" option that belongs to the
// currently selected Word version is checked; false for any other version.
//
function doRemoveMetaLink()
{
  switch (getVersion())
  {
    case "2000":
      return CB.isChecked("removeMetaLink2000_detail");

    case "97":
      return CB.isChecked("removeMetaLink97_detail");
  }

  return false;
}

/////////////////////////////////////////////////////////////////////////////
// Returns whether the "convert size N" option is checked for the given
// font size.  Only the strings "1" through "7" are recognized; anything
// else yields false.
//
function doConvertSize(size)
{
  switch (size)
  {
    case "7":   return CB.isChecked("convertSize7_detail");
    case "6":   return CB.isChecked("convertSize6_detail");
    case "5":   return CB.isChecked("convertSize5_detail");
    case "4":   return CB.isChecked("convertSize4_detail");
    case "3":   return CB.isChecked("convertSize3_detail");
    case "2":   return CB.isChecked("convertSize2_detail");
    case "1":   return CB.isChecked("convertSize1_detail");
  }

  return false;
}

/////////////////////////////////////////////////////////////////////////////
// Function
//    getDesiredFontSize
//
// Purpose
//    Given a size, find out what the user has specified to change that
//    size to.  This queries the dropdown box associated with the given
//    size.
// function getDesiredFontSize(size) { var option; if(size.length != 1) return "-1"; if(size[0] < '1' || size[0] > '7') return "-1"; option = findObject("menuSize"+size); if(option != null) return option.options[option.selectedIndex].value; return "-1"; } ///////////////////////////////////////////////////////////////////////////// function doRemoveXMLFromHTML() { return CB.isChecked("removeXMLHTML2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveXMLMarkup() { return CB.isChecked("removeXMLmarkup2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveIfs() { return CB.isChecked("removeIf2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveEmptyParas() { return CB.isChecked("removeEmptyPara2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveInlineCSS() { return CB.isChecked("removeInlineCSS2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveMSOStyleAttr() { return CB.isChecked("removeInlineCSS2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveNonCSSDeclaration() { return CB.isChecked("removeNonCSS2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveCSSFromTables() { return CB.isChecked("removeCSSTable2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doRemoveUnusedStyles() { return CB.isChecked("removeUnusedCSS2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doFixInvalidNesting() { return CB.isChecked("removeInlineCSS2000_detail"); } ///////////////////////////////////////////////////////////////////////////// function doSetBgColor() { return 
CB.isChecked("setBgColor_basic"); } ///////////////////////////////////////////////////////////////////////////// function doApplySourceFormatting() { return CB.isChecked("applyFormatting_basic"); } ///////////////////////////////////////////////////////////////////////////// function doShowLog() { return CB.isChecked("showLog_basic"); } ///////////////////////////////////////////////////////////////////////////// function doSaveSettings() { return true; } ///////////////////////////////////////////////////////////////////////////// // Function // ProcessWord2000 // // Purpose // This is the main function for doing Word 2000 processing on the // document. // function ProcessWord2000() { if(doRemoveXMLFromHTML()) removeXMLFromHTML(); if(doRemoveXMLMarkup()) removeXMLMarkup(); if(doRemoveIfs()) removeIfs(); if(doRemoveMSOStyleAttr()) removeMSOStyleAttr(); if(doRemoveEmptyParas()) removeEmptyParas(); if(doRemoveCSSFromTables()) removeCSSFromTables(); if(doRemoveNonCSSDeclaration()) removeNonCSSDeclaration(); if(doRemoveInlineCSS()) removeInlineCSS(); if(doRemoveUnusedStyles()) removeUnusedStyles(); // We are done. Do some general cleanup formatCSS(); } ///////////////////////////////////////////////////////////////////////////// function RemoveEmptyTags() { var body = findTag("body"); var emptyTags = new Array(); var tag; // First find all of the tags that are empty inside the body. traverse(body, findEmptyTags, null, null, emptyTags); // Now remove them. while((tag = emptyTags.pop()) != null) { if (dw.nodeExists(tag)) tag.outerHTML = tag.innerHTML; } } ///////////////////////////////////////////////////////////////////////////// function findEmptyTags(tag, emptyTags) { var tagName = tag.tagName; var html; var regx; var result; switch(tagName.toUpperCase()) { // Add new empty tags to be removed here. 
case "DIV": case "SPAN": case "FONT": // Do a match to see if the tag is empty (no attributes) // If it is add it to the list of empty tags that we // will remove from the doc. html = tag.outerHTML; regx = new RegExp("<"+tagName+">", "i"); result = regx.exec(html); if(result != null && result.index != -1) emptyTags.push(tag); break; } return true; } ///////////////////////////////////////////////////////////////////////////// // Function // ProcessWord97 // // Purpose // This is the main function for doing Word 97 processing on the // document. // function ProcessWord97() { if(doRemoveMetaLink()) removeMetaLink(); convertFontSizes(); if(doFixInvalidNesting()) fixInvalidNesting(); } function convertFontSizes() { traverse(null, convertFontSizeHandler); // post processing will strip the empty <font> tags. } ////////////////////////////////////////////////////////////////////////////// // Function // convertFontSizeHandler // // Purpose // Callback that searches for font tags to convert. // function convertFontSizeHandler(tag) { if(tag.tagName.toUpperCase() == "FONT") { var size = tag.getAttribute("size"); var desiredSize; if(size != null && doConvertSize(size)) { desiredSize = getDesiredFontSize(size); switch(desiredSize) { case "-1": // don't change anything break; case "0": // use default size tag.removeAttribute("size"); gFontsConverted++; break; case "1": case "2": case "3": case "4": case "5": case "6": case "7": tag.setAttribute("size", desiredSize); gFontsConverted++; break; case "h1": case "h2": case "h3": case "h4": case "h5": case "h6": // If this font tag is not contained within another block tag, // we can convert it to a header. If the font is contained // within a block tag, it is an inline font size change. We // don't want to convert those to headers since headers create // vertical white space. 
if(!isInsideTag(tag, "p,h1,h2,h3,h4,h5,h6")) { // We remove the size attribute from the <font> tag // and wrap the font tag and all its content with // the appropriate heading. tag.removeAttribute("size"); html = tag.outerHTML; // Strip any internal <p>'s that we might have. We don't // need them since we are converting this to a header. html = html.replace(/<\/?P[^>]*>/ig, ""); html = "<"+desiredSize+">" + html + "</"+desiredSize+">"; tag.outerHTML = html; gFontsConverted++; // Note, we could be leaving behind an empty <font> tag. // But, this is OK. The general post processing will // clean these up. } break; } } } // Keep traversing... return true; } ///////////////////////////////////////////////////////////////////////////// // Function // fixInvalidNesting // // Purpose // Word 97 has no clue about HTML structure. Most HTML documents that // it generates have overlapped tags, and invalid nesting structures. // This function aims to clean up that mess. // // Note! This is a very specialized case for Word 97. This will // not fix all general cases of invalid nesting. // function fixInvalidNesting() { traverse(null, fixHandler); traverse(null, removeMarkedTags); } ///////////////////////////////////////////////////////////////////////////// // Function // fixHandler // // Purpose // Callback for fixing up invalidly nested tags. This is very specific // to how Word 97 generates its HTML. This will NOT fix any general // case of invalid HTML (that problem is actually quite difficult). // function fixHandler(tag, nestingFixes) { var html; var tagName = tag.tagName.toUpperCase(); // If this is a <p> or a header, we need to do some work. 
if(tagName == "P" || tagName == "LI" || (tagName.match(/h[1-6]/i) != null)) { // Fix up them tags var pCase = tag.tagName; // maintain upper/lower case var parent = tag.parentNode; var innerMostHTML = tag.innerHTML; while(parent != null) { if(parent.tagName) { switch(parent.tagName.toUpperCase()) { case "FONT": case "B": case "I": parent.removeAttribute("TO_BE_DELETED"); html = parent.outerHTML; parent.setAttribute("TO_BE_DELETED",true); // We use match here to make sure we maintain any tag attributes. startTag = html.match(/<[^>]*>/); if(startTag != null) { innerMostHTML = startTag[0] + innerMostHTML + "</" + parent.tagName + ">"; } break; } } parent = parent.parentNode; } tag.innerHTML = innerMostHTML; //actually change the internal tag } return true; } function removeMarkedTags(tag) { if (tag.getAttribute("TO_BE_DELETED")) { tag.outerHTML = tag.innerHTML; //blow away outer tag } return true; } ///////////////////////////////////////////////////////////////////////////// // Function // removeXMLFromHTML // // Purpose // Word puts some useless XML markup in the start <html> tag, nuke it. // function removeXMLFromHTML() { var root = getRootNode(); var html = root.outerHTML; // We have 2 submatches, "<html", everything after "<html" to the ending // ">". We want to throw out everything between "<html" and the end. // So, we will just keep $1. html = html.replace(/(<html)([^>]*)/, "$1"); root.outerHTML = html; } ///////////////////////////////////////////////////////////////////////////// // Function // removeXMLMarkup // // Purpose // Word puts some random, useless XML markup in the body. Strip it. // function removeXMLMarkup() { var root = dw.getDocumentDOM('document').documentElement; var html = root.outerHTML; if(doShowLog()) { var match; match = html.match(/<o:p>/g); gRemoveWordXML += (match != null ? match.length : 0); match = html.match(/<\/o:p>/g); gRemoveWordXML += (match != null ? 
match.length : 0); } // Remove all instances of <o:p></o:p> html = html.replace(/<o:p>/g, ""); html = html.replace(/<\/o:p>/g, ""); // If we find any other instances of XML markup, we can add it here. root.outerHTML = html; } ///////////////////////////////////////////////////////////////////////////// // Function // removeIfs // // Purpose // Word uses many <![if...]> style comments for its own internal // purposes, which are useless in HTML. This function strips those. // function removeIfs() { traverse(null, null, null, ifHandler); var root = dw.getDocumentDOM('document').documentElement; var html = root.outerHTML; // clean up those empty comments! html = html.replace(/<!-*>/g, ""); root.outerHTML = html; } ///////////////////////////////////////////////////////////////////////////// // Function // ifHandler // // Purpose // Find those pesky "if" conditionals that Word 2000 puts in the HTML // and nuke'em. // function ifHandler(comment) { var html = comment.data; var matchif = html.match(/\[if /); var matchendif = html.match(/\[endif/); if(matchif != null || matchendif != null) { gRemoveConditionals++; comment.data = ""; } return true; } ///////////////////////////////////////////////////////////////////////////// // Function // removeMSOStyleAttr // // Purpose // Microsoft Word uses many custom CSS attributes. This function hunts // them down and removes them. // function removeMSOStyleAttr() { var root = dw.getDocumentDOM('document').documentElement; var html = root.outerHTML; RegExp.multiline = true; if(doShowLog()) { // NOTE! This is highly ineffiecient since we are doing the regexp // searchs twice (once to count, once to do the actual replaces). // If there is a better way to know how many times a replace() // does its thing, we should do that. 
var match; match = html.match(/mso-[^:]*:"[^"]*";/g, ""); if (match) gRemovemsoStyle += match.length; match = html.match(/mso-[^;'"]*;*(\n|\r)*/g, ""); if (match) gRemovemsoStyle += match.length; match = html.match(/page-break-after[^;]*;/g, ""); if (match) gRemovemsoStyle += match.length; match = html.match(/ style=['"]tab-interval:[^'"]*['"]/g, ""); if (match) gRemovemsoStyle += match.length; } // This finds the mso-*:"SomeStuff"; style attributes and sets them to be nothing. html = html.replace(/mso-[^:]*:"[^"]*";/g, ""); // This finds the other mso-* style attibutes. html = html.replace(/mso-[^;'"]*;*(\n|\r)*/g, ""); // Remove some other Word-only css style attributes. html = html.replace(/page-break-after[^;]*;/g, ""); html = html.replace(/ style=['"]tab-interval:[^'"]*['"]/g, ""); root.outerHTML = html; } ///////////////////////////////////////////////////////////////////////////// // Function // removeEmptyParas // // Purpose // Word sets paragraph bottom margins to zero, then inserts empty // (containing ) paragraphs to maintain the vertical spacing // expected. This is soley for its own purpose and is redundant // for HTML. This function removes margin definitions and removes // those pesky empty paragraphs. // function removeEmptyParas() { var root = dw.getDocumentDOM('document').documentElement; var style = findTag("style"); var html = null; if(style != null) { // Clean out the nonsense zero margin definitions from the // style block. html = style.innerHTML; // Just strip all of those wacky margins that Word puts in there. html = html.replace(/margin[^:]*:[^\n\r]*/g, ""); style.innerHTML = html; } // Next, go through the document and strip out those inline margins too. traverse(root, stripMargins); // Now go find those empty paragraphs and remove them. 
traverse(root, paraHandler); } ///////////////////////////////////////////////////////////////////////////// // Function // stripMargins // // Purpose // Word uses a lot of CSS margin settings in attempt to make the HTML // version look exactly like the native Word version. In general, this // is un wanted, so lets remove all this stuff. // function stripMargins(tag) { var style = tag.getAttribute("style"); if(style != null) { if(doShowLog()) { // Note, if there is a better way to count (if replace can be forced // to report how many replaces it did), we should do that. Because // this takes extra processing effort to count this stuff using // "match". var match; match = style.match(/margin[^"';]*;?/g); gRemoveMargins += (match != null ? match.length : 0); match = style.match(/text-indent[^"';]*;?/g); gRemoveMargins += (match != null ? match.length : 0); match = style.match(/tab-stops:[^'";]*;?/g); gRemoveMargins += (match != null ? match.length : 0); } style = style.replace(/margin[^"';]*;?/g, ""); style = style.replace(/text-indent[^"';]*;?/g, ""); style = style.replace(/tab-stops:[^'";]*;?/g, ""); if(style == null || style == "") tag.removeAttribute("style"); else tag.setAttribute("style", style); } return true; } ///////////////////////////////////////////////////////////////////////////// // Function // paraHandler // // Purpose // Callback that looks for empty <p>'s and deletes them. After // doing some processing removing stuff, we can easily end up // with empty paragraphs. This just cleans up after ourselves. // function paraHandler(tag) { tagName = tag.tagName; if(tagName.toUpperCase() == "P") { text = tag.innerHTML; // Make sure there are not any content generating HTML tags, this // prevents us from removing say, <img> in the next step. if(containsContentTags(text)) return true; // Keep searching in the traverse // Ok, we don't have any content tags. We are save to strip any // other tags (font, b, etc). 
text = text.replace(/<[^>]*>/g, ""); // Strip whitespace text = text.replace(/\s/g, ""); // Strip s text = text.replace(/ /g, ""); // After doing all that, if there is nothing left, this paragraph is empty. if(text == "" || text == null) { gRemoveEmptyParas++; tag.outerHTML = ""; } } return true; } ///////////////////////////////////////////////////////////////////////////// // Function // containsContentTags // // Purpose // Given a string that contains some HTML, check to see if we have // any tags that generate visible content. // // Returns // zero (false) if no content generating tags are found. Non-zero // value if anything is found. // function containsContentTags(text) { var index = 0; // text.search returns -1 if it does not find anything. So by adding // 1 and bitwise or-ing the result, we maintain zero if no match. index |= text.search(/<hr/i) + 1; index |= text.search(/<img/i) + 1; index |= text.search(/<input/i) + 1; index |= text.search(/<object/i) + 1; index |= text.search(/<table/i) + 1; index |= text.search(/<textarea/i) + 1; index |= text.search(/<embed/i) + 1; // if index is still zero after all that, we don't have any content tags. return index; } ///////////////////////////////////////////////////////////////////////////// // Function // formatCSS // // Purpose // When we modify or remove stuff from the <style> block we leave it in // a not so pretty state. This will clean it up so it looks nice. // function formatCSS() { var style = findTag("style"); if(style != null) { var html = style.innerHTML; // We need multiline turned on for this. var multiline = RegExp.multiline; RegExp.multiline = true; // Lets just get rid of those comments that Word puts in there html = html.replace(/\/\*.*\*\//g, ""); // Clean up the whitespace between the start and end brackets. html = html.replace(/\s*\}/g, "}"); html = html.replace(/\{\s*/g, "{"); // Make sure anything that is indented is indented only one tab. 
html = html.replace(/^\t+/g, "\t"); // Make sure the style names are on their own line. html = html.replace(/\}/g, "}\n"); // This will delete blank lines in the style declaration html = html.replace(/^[ \t]*(\r|\n)+/g, ""); // Set it back RegExp.multiline = multiline; style.innerHTML = html; } } ///////////////////////////////////////////////////////////////////////////// // Function // removeCSSFromTables // // Purpose // Word tends to go overboard with CSS with tables. Almost all of it // is used to maintain the "Word appearance" and is generally undesirable // for use with HTML. So, this function just strips it all. // function removeCSSFromTables() { // Find each table tag and do some processing on it. traverse(null, convertCSSInTables); } ///////////////////////////////////////////////////////////////////////////// // Function // convertCSSInTables // // Purpose // We want to strip the CSS applied to tables and their cells. // However, some of these styles can be converted into HTML // attributes. This function converts any styles we can to // HTML attributes and then removes the style tag. // function convertCSSInTables(tag) { var tagName = tag.tagName.toUpperCase(); var style; var match; if(tagName == "TABLE") { style = tag.getAttribute("style"); if(style != null && style != "") { match = style.match(/border-color: *([^;'"]*)/); if(match != null) tag.setAttribute("bordercolor", match[1]); tag.removeAttribute("style"); gRemoveTableCSS++; } } else if(tagName == "TR") { // TRs do not have any styles that we want to keep. tag.removeAttribute("style"); gRemoveTableCSS++; } else if(tagName == "TD") { style = tag.getAttribute("style"); if(style != null && style != "") { match = style.match(/background: *([^;'"]*)/); if(match != null) tag.setAttribute("bgcolor", match[1]); // Lastly, kill the style tag. 
tag.removeAttribute("style"); gRemoveTableCSS++; } } return true; } ///////////////////////////////////////////////////////////////////////////// // Function // removeNonCSSDeclaration // // Purpose // Word puts a number of non-standard CSS style declarations in the // style block. This will strip them, and remove any references to // them. // function removeNonCSSDeclaration() { var root = dw.getDocumentDOM('document').documentElement; var data = new Array(); var invalidList; var style; style = findTag("style"); if(style != null) { var html = style.innerHTML; var htmlLeft, htmlRight; // First, we need to get a list of all the invalid style names. This // way, we can go through the file and remove the references to them. invalidList = html.match(/^@[^\s]* .*/g); if(invalidList != null) { // Log the number of invalid CSS styles we find. gRemoveNonCSS += invalidList.length; for(i = 0; i < invalidList.length; i++) { invalidList[i] = invalidList[i].replace(/@\w* /g, ""); invalidList[i] = invalidList[i].replace(/(\r|\n)*/g, ""); } } // This removes the invalid "@" CSS declarations html = html.replace(/^@[^}]*}/g, ""); // Now we need to go and clean out everything that referenced the // invalid styles. First lets finish cleaning the style block. if(invalidList != null) { var regx = new RegExp(); var htmlLeft, htmlRight; var result; for(i = 0; i < invalidList.length; i++) { // Find stuff of the form "div.Section1 ... { ... }" regx.compile("^.*\\."+invalidList[i]+"[^}]*}", "g"); while((result = regx.exec(html)) != null) { htmlLeft = html.substring(0, result.index); htmlRight = html.substring(result.index + result[0].length); // Remove the match html = htmlLeft + htmlRight; } } style.innerHTML = html; } else { style.innerHTML = html; } } // OK, we cleaned up the style block, now we just need to go // through the rest of the document and remove any references // to the invalid CSS classes. 
if(invalidList != null) { root = dw.getDocumentDOM('document').documentElement; var html = root.outerHTML; var regx = new RegExp(); for(i = 0; i < invalidList.length; i++) { regx.compile(" class=(\")?"+invalidList[i]+"\\1", "g"); while((result = regx.exec(html)) != null) { htmlLeft = html.substring(0, result.index); htmlRight = html.substring(result.index + result[0].length); html = htmlLeft + htmlRight; } } root.outerHTML = html; } } ///////////////////////////////////////////////////////////////////////////// // Function // removeMetaLink // // Purpose // Removes those nasty Microsoft-only Meta tags. // function removeMetaLink() { var head = findTag("head"); traverse(head, removeMetasHandler); } ///////////////////////////////////////////////////////////////////////////// // Function // removeMetasHandler // // Purpose // This function finds the meta tags in the document and checks // to see if they have any MS junk in them. If they do, bye bye. // function removeMetasHandler(tag, metaTags) { var tagName = tag.tagName.toUpperCase(); var html; if(tagName == "META") { html = tag.outerHTML; index = html.search(/(word|microsoft)/i); if(index != -1) { // This tag contains some Word junk, nuke it. tag.outerHTML=""; gRemoveMetaTags++; } } else if(tagName == "LINK") { var rel = tag.getAttribute("rel"); if(rel == "File-List") { tag.outerHTML = ""; gRemoveMetaTags++; } } return true; } ///////////////////////////////////////////////////////////////////////////// function setBgColor() { var body = findTag("body"); if(body != null) { var colorObj = findObject("bgcolor_basic"); var color = (colorObj == null ? null : colorObj.value); if(color != null) { body.setAttribute("bgcolor", color); gBackgroundSet = color; } } } ///////////////////////////////////////////////////////////////////////////// // Function // removeInlineCSS // // Purpose // Word 2000 loves to declare "normal" styles and then apply them to // every block in the document. 
//    Let's just set it on the body and
//    remove it from everything else.
//
// Notes
//    The various *.MsoNormal rules in the style block are collapsed into
//    a single ".Normal" rule.  class=MsoNormal attributes (bare or
//    quoted) are then removed from the body contents, and the "Normal"
//    class is set on the body and on every table cell.  When logging is
//    on, each removed attribute is counted in gRemoveInlineCSS.
//
function removeInlineCSS()
{
  var style = findTag("style");
  var body;
  var html;
  var match;

  // Matches the class attribute whether or not the value is quoted.  The
  // character after the value is captured (and put back) so that longer
  // class names which merely start with "MsoNormal" are left alone.
  var classAttr = / class=(["']?)MsoNormal\1([\s>])/g;

  if(style == null)
    return;

  html = style.innerHTML;
  if(html.search(/\.(MsoNormal)/i) == -1)
    return;

  // Lets strip out the "normal" styles and make only one.  Word
  // tends to have stuff like p.MsoNormal, li.MsoNormal, etc.
  html = html.replace(/^\s*\w*\.MsoNormal/ig, ".TempNormal");

  // Change the first one we find to "FirstNormal"
  html = html.replace(/\.TempNormal/, ".FirstNormal");

  // Remove the rest.
  html = html.replace(/\.TempNormal/g, "");

  // Change the first normal to just "Normal"
  html = html.replace(/^.*\.FirstNormal[^\r\n]*/, ".Normal");

  style.innerHTML = html;

  // Now we need to go and remove all references to the old class.
  body = findTag("body");
  if(body == null)
    return;   // no body, so there are no references to clean up

  html = body.innerHTML;

  if(doShowLog())
  {
    match = html.match(classAttr);
    gRemoveInlineCSS += (match != null ? match.length : 0);
  }

  html = html.replace(classAttr, "$2");

  body.innerHTML = html;
  body.setAttribute("class", "Normal");

  // Since body styles do not filter down into table cells,
  // we need to set the styles on the table cells too.
  traverse(null, setTDStyles);
}


/////////////////////////////////////////////////////////////////////////////
// Function
//    setTDStyles
//
// Purpose
//    We have removed the styles from individual paragraphs and the body
//    style does not filter down into the table cells, so we need to set
//    the style on the table cells too.
//
// Returns
//    Always true.
//
function setTDStyles(tag)
{
  if(tag.tagName.toUpperCase() == "TD")
  {
    tag.setAttribute("class", "Normal");
  }
  return true;
}


/////////////////////////////////////////////////////////////////////////////
// Function
//    removeUnusedStyles
//
// Purpose
//    After we have done all of our house cleaning, some styles defined
//    in the head may no longer be used anywhere.  If they are no longer
//    used, we will blow them away.
//
// Notes
//    A rule counts as "used" when the body contains a class attribute
//    whose value starts with the rule's name.  The style block is then
//    rebuilt from the surviving rules, one per line; an HTML comment
//    wrapper is kept if the block had one.  Each rule removed is counted
//    in gRemoveUnusedCSS.
//    NOTE(review): rules are collected with a pattern that also picks up
//    selectors without a leading dot (e.g. "h1 {...}"), and those are
//    looked up as class names too - confirm that this is intended.
//
function removeUnusedStyles()
{
  var style = findTag("style");
  var body = findTag("body");
  var html;
  var bodyhtml;
  var classes;
  var classNames;
  var regx;
  var rebuilt;
  var i;

  // Without both a style block and a body there is nothing to compare.
  if(style == null || body == null)
    return;

  html = style.innerHTML;

  // Put each style class in an array.
  classes = html.match(/\.?\w*\s*\{[^}]*\}/g);
  if(classes == null)
    return;

  // Clean up the matches so we only have the class name.
  classNames = new Array(classes.length);
  for(i = 0; i < classes.length; i++)
    classNames[i] = classes[i].replace(/^\s*\.?(\w*)\s*\{[^}]*\}/g, "$1");

  bodyhtml = body.outerHTML;

  // Now search in the body to see if we use them anywhere.
  for(i = 0; i < classes.length; i++)
  {
    regx = new RegExp("class=['\"]?" + classNames[i]);
    if(regx.exec(bodyhtml) == null)
    {
      // this style is not used.  Nuke it.
      classes[i] = "";
      gRemoveUnusedCSS++;
    }
  }

  // Now reconstruct the style block from whatever survived.
  rebuilt = "";
  for(i = 0; i < classes.length; i++)
  {
    if(classes[i] != "")
      rebuilt += classes[i] + "\n";
  }

  // Keep the rules hidden inside an HTML comment if they were before.
  if(html.indexOf("<!--") != -1)
    rebuilt = "<!--\n" + rebuilt + "-->";

  style.innerHTML = "\n" + rebuilt + "\n";
}


//*************** Pg1 Class *****************

//This is an example of a page class to be used with the TabControl.
//Uncomment the alert() calls to display the various events as they occur.

// Creates a page object that only remembers the label of its tab.
function Pg1(theTabLabel)
{
  this.tabLabel = theTabLabel;
}

Pg1.prototype.getTabLabel = Pg1_getTabLabel;

// Returns the label given to the constructor.
function Pg1_getTabLabel()
{
  return this.tabLabel;
}

//***************** End of Pg1 Class ******************


//*************** Pg2 Class *****************

//This is an example of a page class to be used with the TabControl.
//Uncomment the alert() calls to display the various events as they occur.

// Creates a page object that only remembers the label of its tab.
function Pg2(theTabLabel)
{
  this.tabLabel = theTabLabel;
}

Pg2.prototype.getTabLabel = Pg2_getTabLabel;

// Returns the label given to the constructor.
function Pg2_getTabLabel()
{
  return this.tabLabel;
}

//***************** End of Pg2 Class ******************


//*************** Pg3 Class *****************

//This is an example of a page class to be used with the TabControl.
//Uncomment the alert() calls to display the various events as they occur.
// Creates a page object that remembers the label of its tab and whether
// its dropdowns have been set up yet.
function Pg3(theTabLabel)
{
  this.tabLabel = theTabLabel;
  this.loaded = false;   // set to true by the first canLoad() call
}

Pg3.prototype.getTabLabel = Pg3_getTabLabel;
Pg3.prototype.canLoad = Pg3_canLoad;
Pg3.prototype.unload = Pg3_unload;
Pg3.prototype.lastUnload = Pg3_lastUnload;

// Returns the label given to the constructor.
function Pg3_getTabLabel()
{
  return this.tabLabel;
}

// Sets up the dropdowns the first time it is called; later calls do
// nothing.  Always returns true.
function Pg3_canLoad()
{
  if (!this.loaded)
  {
    initDropDowns();
    setDropDownStates();
    this.loaded = true;
  }
  return true;
}

// Switches the visibility of T.obj to "hidden" and straight back to
// "visible" (presumably to force a redraw - confirm).  Always returns true.
function Pg3_unload()
{
  T.obj.visibility = "hidden";
  T.obj.visibility = "visible";
  return true;
}

// Returns the result of canLoad().
function Pg3_lastUnload()
{
  return this.canLoad(); //ensure dropdowns are initted before we do our work
}

//***************** End of Pg3 Class ******************